Additional Mutation Rates
source("func.R")
library("patchwork")
library("ggpubr")
library("rstatix")
ggtree v3.7.2 For help: https://yulab-smu.top/treedata-book/ If you use the ggtree package suite in published research, please cite the appropriate paper(s): Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628 G Yu. Data Integration, Manipulation and Visualization of Phylogenetic Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574 Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods for mapping and visualizing associated data on phylogeny using ggtree. Molecular Biology and Evolution. 2018, 35(12):3041-3043. doi:10.1093/molbev/msy194 Attaching package: 'ggtree' Attaching package: 'ggtree' The following object is masked from 'package:ape': rotate Loading required package: maps -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 -- v dplyr 1.1.1 v readr 2.1.4 v forcats 1.0.0 v stringr 1.5.0 v ggplot2 3.5.1 v tibble 3.2.1 v lubridate 1.9.2 v tidyr 1.3.0 v purrr 1.0.1 -- Conflicts ------------------------------------------ tidyverse_conflicts() -- x dplyr::between() masks data.table::between() x tidyr::expand() masks ggtree::expand() x dplyr::filter() masks stats::filter() x dplyr::first() masks data.table::first() x lubridate::hour() masks data.table::hour() x lubridate::isoweek() masks data.table::isoweek() x dplyr::lag() masks stats::lag() x dplyr::last() masks data.table::last() x purrr::map() masks maps::map() x lubridate::mday() masks data.table::mday() x lubridate::minute() masks data.table::minute() x lubridate::month() masks data.table::month() x lubridate::quarter() masks data.table::quarter() x lubridate::second() masks data.table::second() x purrr::transpose() masks data.table::transpose() x lubridate::wday() masks data.table::wday() x lubridate::week() masks data.table::week() x dplyr::where() masks ape::where() x lubridate::yday() masks data.table::yday() x lubridate::year() masks data.table::year() i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors Attaching package: 'gridExtra' The following object is masked from 'package:dplyr': combine Attaching package: 'ggpubr' The following object is masked from 'package:ggtree': rotate The following object is masked from 'package:ape': rotate Attaching package: 'rstatix' The following object is masked from 'package:stats': filter
Function to get L1 distance of any shared markers
get_l1_r_for_all_combination <- function(i, combos, markerlengths) {
sample_a <- combos$a[i]
sample_b <- combos$b[i]
markerlength_tbl_a <- markerlengths %>%
dplyr::filter(sample == sample_a)
markerlength_tbl_b <- markerlengths %>%
dplyr::filter(sample == sample_b)
common_markers <- intersect(markerlength_tbl_a$marker, markerlength_tbl_b$marker)
markerlengths_a <- markerlength_tbl_a %>%
dplyr::filter(marker%in%common_markers) %>%
arrange(marker) %>%
pull(length)
markerlengths_b <- markerlength_tbl_b %>%
dplyr::filter(marker%in%common_markers) %>%
arrange(marker) %>%
pull(length)
n_markers_a <- length(markerlengths_a)
n_markers_b <- length(markerlengths_b)
if (n_markers_a != n_markers_b) (error)
l1 <- sum(abs(markerlengths_a - markerlengths_b))/n_markers_a
list(a = sample_a, b = sample_b, l1 = l1, marker = n_markers_a)
}
LUAD cell line
# directroy of all results
dir_list_a549 <- list.files("../data/invitro_mu/",
pattern= "A549", full.names = TRUE)
l <- lapply(
dir_list_a549,
get_markerlengths_unsubtracted
)
markerlengths_raw_a549 <- bind_rows(l)
A549
Warning message:
"Using an external vector in selections was deprecated in tidyselect 1.1.0.
i Please use `all_of()` or `any_of()` instead.
# Was:
data %>% select(cols)
# Now:
data %>% select(all_of(cols))
See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>."
markerlengths_a549 <- markerlengths_raw_a549 %>%
group_by(subject, marker) %>%
filter(any(str_detect(sample, "Root2"))) %>%
mutate(length = length - length[sample==str_subset(sample, "Root2")[1]]) %>%
ungroup()
filtered_matrix <- markerlengths_a549 %>%
select(-subject) %>%
filter(sample!="A549Root1") %>%
pivot_wider(names_from = marker,
values_from = length) %>%
column_to_rownames("sample") %>%
as.matrix()
minmax <- max(abs(filtered_matrix), na.rm = TRUE)
plot_title <- "Lung adenocarcinoma (A549)"
options(repr.plot.width=14, repr.plot.height=7)
hm <- pheatmap::pheatmap(filtered_matrix,
clustering_distance_rows = "manhattan",
clustering_distance_cols = "manhattan",
display_numbers = TRUE, number_color = "black", fontsize = 20,
color = colorRampPalette(c("seagreen", "white", "purple3"))(42),
na_col = "yellow", breaks = seq(-minmax, minmax, length.out = 43),
main = plot_title
)
ggsave("../plots/Supplementary_figures/a549_hm.pdf", hm, height=13, width=30)
# find samples
samples <- markerlengths_a549$sample %>% unique
combos_wide <- combn(samples, m= 2) %>% as.data.frame()
## make combo table longer
combos_long <- data.frame(a=as.character(combos_wide[1,]), b=as.character(combos_wide[2,]))
## only use combintions from the same subject
combos <- combos_long %>%
mutate(sample_a=str_extract(a, "^[:upper:]+[:digit:]+"),
sample_b=str_extract(b, "^[:upper:]+[:digit:]+")) %>%
filter(sample_a==sample_b) %>%
dplyr::select(a, b)
l1_l <- lapply(1:nrow(combos), get_l1_r_for_all_combination, combos, markerlengths_a549)
a549_l1_tbl <- rbindlist(l1_l)
a549_data <- a549_l1_tbl %>%
filter(a != "A549Root1", b != "A549Root1") %>%
mutate(divisions = case_when(
a == "A549Root2" & str_detect(b, "_G1_") ~ 21.33756093,
a == "A549Root2" & str_detect(b, "_G2_") ~ 42.16817502,
str_detect(a, "_G1_") & str_detect(b, "_G1_") ~ 21.33756093 * 2,
str_detect(a, "_G2_") & str_detect(b, "_G2_") ~ 42.16817502 * 2,
str_detect(a, "_G1_") & str_detect(b, "_G2_") ~ 21.33756093 + 42.16817502,
str_detect(b, "_G1_") & str_detect(a, "_G2_") ~ 21.33756093 + 42.16817502
), days = case_when(
a == "A549Root2" & str_detect(b, "_G1_") ~ 22,
a == "A549Root2" & str_detect(b, "_G2_") ~ 49,
str_detect(a, "_G1_") & str_detect(b, "_G1_") ~ 22 * 2,
str_detect(a, "_G2_") & str_detect(b, "_G2_") ~ 49 * 2,
str_detect(a, "_G1_") & str_detect(b, "_G2_") ~ 22 + 49,
str_detect(b, "_G1_") & str_detect(a, "_G2_") ~ 22 + 49
))
Bootstrap
# finding combination of samples to bootstrap
get_boot_combos <- function(markerlengths) {
# find samples
samples <- markerlengths %>%
pull(sample) %>%
unique()
combos_wide <- combn(samples, m = 2) %>% as.data.frame()
## make combo table longer
combos_long <- data.frame(a = as.character(combos_wide[1, ]), b = as.character(combos_wide[2, ]))
}
l1_bootstrap <- function(i, combos, markerlengths) {
sample_a <- combos$a[i]
sample_b <- combos$b[i]
markerlength_tbl_a <- markerlengths %>%
dplyr::filter(sample == sample_a)
markerlength_tbl_b <- markerlengths %>%
dplyr::filter(sample == sample_b)
markerlengths_a <- c(markerlength_tbl_a$length)
names(markerlengths_a) <- markerlength_tbl_a$marker
markerlengths_b <- c(markerlength_tbl_b$length)
names(markerlengths_b) <- markerlength_tbl_b$marker
common_markers <- intersect(markerlength_tbl_a$marker, markerlength_tbl_b$marker)
marker_length <- length(common_markers)
boot_markers <- sample(common_markers, marker_length, replace = T)
markerlengths_a <- markerlengths_a[boot_markers]
markerlengths_b <- markerlengths_b[boot_markers]
l1 <- sum(abs(markerlengths_a - markerlengths_b))/marker_length
list(a = sample_a, b = sample_b, l1 = l1, marker = marker_length)
}
# function to the find the MRCA of a specific marker combination
boot_mu <- function(i, subject_markers, boot_combos, generations_tbl) {
l1_comb_boot <- lapply(1:nrow(boot_combos), l1_bootstrap, boot_combos, subject_markers) %>%
bind_rows()
l1_gen_tbl <- l1_comb_boot %>%
left_join(generations_tbl)
coef((lm(l1_gen_tbl$l1 ~ l1_gen_tbl$divisions)))[[2]]
}
bootstrap_mu <- function(markerlengths, generations_tbl) {
# getting combos to bootstrap through
boot_combos <- get_boot_combos(markerlengths)
# number of bootstrap reps
reps <- 1000
mrca_boot <- parallel::mclapply(1:reps, boot_mu, markerlengths, boot_combos, generations_tbl)
return(mrca_boot)
}
a549_just_divs <- a549_data %>%
select(a, b, divisions)
markerlengths_a549_boot <- markerlengths_a549 %>%
filter(sample!="A549Root1")
a549_boot <-bootstrap_mu(markerlengths_a549_boot, a549_just_divs)
quantile(unlist(a549_boot), 0.025) %>%
format(scientific = TRUE)
quantile(unlist(a549_boot), 0.975) %>%
format(scientific = TRUE)
lower_ci_a549 <- sub("e", " %*% 10^", quantile(unlist(a549_boot), 0.025) %>%
format(scientific = TRUE))
upper_ci_a549 <- sub("e", " %*% 10^", quantile(unlist(a549_boot), 0.975) %>%
format(scientific = TRUE))
Manual entering bootstrap values from previous run to reduce computation time.
lower_ci_a549 <- sub("e", " %*% 10^", "8.15e-06")
upper_ci_a549 <- sub("e", " %*% 10^", "5.58e-05")
# calculating mutation rate based on slope
mu_a549 <- coef((lm(a549_data$l1 ~ a549_data$divisions)))[2]
mu_a549_form <- sub("e", " %*% 10^", format(mu_a549, digits=3, scientific = TRUE))
mu_a549_annotation <- paste0("mu == ", mu_a549_form)
mu_ci_a549_annotation <- paste0("(", lower_ci_a549, "-", upper_ci_a549,")")
a549_data %>%
ggplot(aes(divisions, l1)) +
geom_jitter(size = 4,width = 0.8) +
#ggbeeswarm::geom_quasirandom(size = 4,width = 1) +
geom_smooth(method = "lm", size = 3) +
theme_martin() +
labs(
x = "Number of cell divisions",
y = "L1 distance"
) +
annotate("text", 20, 0.04, label = mu_a549_annotation, size = 8, parse = TRUE, hjust = 0) +
annotate("text", 44, 0.04, label = mu_ci_a549_annotation, size = 8, parse = TRUE, hjust = 0) +
stat_cor(aes(label = ..p.label..), size = 8, label.y = 0.037)
ggsave("../plots/Figure2/mu_a549.pdf", width=8)
`geom_smooth()` using formula = 'y ~ x'
markerlengths_a549 %>%
filter(!str_detect(sample, "Root")) %>%
select(sample) %>%
distinct() %>%
mutate(type = ifelse(str_detect(sample, "G1"), "G1", "G2")) %>%
count(type)
| type | n |
|---|---|
| <chr> | <int> |
| G1 | 14 |
| G2 | 15 |
Normal breast epithelial cell line
# directroy of all results
dir_list <- list.files("../data/invitro_mu/",
pattern= "HMEC", full.names = TRUE)
# getting markerlengths without subtracting the normal length
get_markerlengths_hmec <- function(dir) {
subject <- str_split(dir, "/") %>%
purrr::map(tail, n = 1) %>%
unlist() %>%
str_split("_") %>%
purrr::map(1) %>%
unlist()
message(subject)
## path to poly-G raw data directory (marker length files)
marker_dir <- list.files(dir, pattern = "repre_repli", full.names = TRUE)
## load marker lengths and get the average length of each marker in each sample
get_markers <- function(marker_dir) {
marker <- suppressMessages(read_tsv(marker_dir))
# getting the marker name
int_file <- tail(str_split(marker_dir, "/")[[1]], 1)
marker_name <- head(str_split(int_file, "_")[[1]], 1)
cols <- NCOL(marker) +1
marker %>%
rowid_to_column("length") %>%
pivot_longer(cols=c(2:cols), names_to="sample", values_to="Frequency") %>%
group_by(sample) %>%
mutate(Frequency=Frequency/sum(Frequency),
sample=str_remove(sample, "_[1-3]$")) %>%
summarize(length = sum(length * Frequency)) %>%
mutate(marker=marker_name)
}
marker_files <- list.files(marker_dir, full.names = TRUE)
markers <- lapply(marker_files, get_markers)
markers <- bind_rows(markers)
markers <- markers %>%
ungroup %>%
as.data.table
markers$subject <- subject
return(markers)
}
l <- lapply(
dir_list,
get_markerlengths_hmec
)
markerlengths_hmec_raw <- bind_rows(l)
HMEC
markerlengths_hmec <- markerlengths_hmec_raw %>%
group_by(subject, marker) %>%
filter(any(str_detect(sample, "Root1"))) %>%
mutate(length = length - length[sample==str_subset(sample, "Root1")[1]]) %>%
ungroup()
filtered_matrix <- markerlengths_hmec %>%
select(-subject) %>%
pivot_wider(names_from = marker,
values_from = length) %>%
column_to_rownames("sample") %>%
as.matrix()
minmax <- max(abs(filtered_matrix), na.rm = TRUE)
plot_title <- "Normal breast epithelium (HMEC)"
options(repr.plot.width=14, repr.plot.height=7)
hm <- pheatmap::pheatmap(filtered_matrix,
clustering_distance_rows = "manhattan",
clustering_distance_cols = "manhattan",
display_numbers = TRUE, number_color = "black", fontsize = 20,
color = colorRampPalette(c("seagreen", "white", "purple3"))(42),
na_col = "yellow", breaks = seq(-minmax, minmax, length.out = 43),
main = plot_title
)
ggsave("../plots/Supplementary_figures/hmec_hm.pdf", hm, height=13, width=30)
# find samples
samples <- markerlengths_hmec$sample %>% unique
combos_wide <- combn(samples, m= 2) %>% as.data.frame()
## make combo table longer
combos_long <- data.frame(a=as.character(combos_wide[1,]), b=as.character(combos_wide[2,]))
## only use combintions from the same subject
combos <- combos_long #%>%
#filter(a!="HMEC_Root2", b!="HMEC_Root2")
l1_l_hmec <- lapply(1:nrow(combos), get_l1_r_for_all_combination, combos, markerlengths_hmec)
l1_tbl_hmec <- rbindlist(l1_l_hmec) %>%
filter(a != "HMEC_Root2" , b != "HMEC_Root2")
Annotating divisions:
hmec_divs_numbers <- read_tsv("~/HMS Dropbox/Naxerova_Lab/Martin/Wet lab/cell_culture/HMEC_divs_numbers.txt") %>%
mutate(days=days+41)
Rows: 5 Columns: 3 -- Column specification -------------------------------------------------------- Delimiter: "\t" chr (1): sample dbl (2): divs, days -- Column specification -------------------------------------------------------- Delimiter: "\t" chr (1): sample dbl (2): divs, days i Use `spec()` to retrieve the full column specification for this data. i Specify the column types or set `show_col_types = FALSE` to quiet this message.
hmec_data <- l1_tbl_hmec %>%
filter(a!="HMEC_Root2", b!="HMEC_Root2") %>%
mutate(
type_a = str_remove(str_remove(a, "HMEC_"), "_[:digit:].*$"),
type_b = str_remove(str_remove(b, "HMEC_"), "_[:digit:].*$"),
across(type_a:type_b, ~ case_when(.x == "4" ~ "4_G1", .x == "STAT" ~ "STAT_G1", TRUE ~ .x))
) %>%
left_join(hmec_divs_numbers, by = c("type_a" = "sample")) %>%
left_join(hmec_divs_numbers, by = c("type_b" = "sample")) %>%
mutate(across(divs.x:divs.y, ~ replace_na(.x, 0)),
across(days.x:days.y, ~ replace_na(.x, 0)),
divisions = divs.x + divs.y,
day_dist = days.x+days.y,
) %>%
filter(divisions != 0)
hmec_just_divs <- hmec_data %>%
select(a, b, divisions)
markerlengths_hmec_boot <- markerlengths_hmec %>%
filter(sample!="HMEC_Root2")
hmec_boot <- bootstrap_mu(markerlengths_hmec_boot, hmec_just_divs)
quantile(unlist(hmec_boot), 0.025) %>%
format(scientific = TRUE)
quantile(unlist(hmec_boot), 0.975) %>%
format(scientific = TRUE)
lower_ci_hmec <- sub("e", " %*% 10^", quantile(unlist(hmec_boot), 0.025) %>%
format(scientific = TRUE))
upper_ci_hmec <- sub("e", " %*% 10^", quantile(unlist(hmec_boot), 0.975) %>%
format(scientific = TRUE))
Manual entering bootstrap values from previous run to reduce computation time.
lower_ci_hmec <- sub("e", " %*% 10^", "3.02e-05")
upper_ci_hmec <- sub("e", " %*% 10^", "1.40e-04")
# calculating mutation rate based on slope
mu_hmec <- coef((lm(hmec_data$l1 ~ hmec_data$divisions)))[2]
mu_hmec_form <- sub("e", " %*% 10^", format(mu_hmec, digits=3, scientific = TRUE))
mu_hmec_annotation <- paste0("mu == ", mu_hmec_form)
mu_ci_hmec_annotation <- paste0("(", lower_ci_hmec, "-", upper_ci_hmec,")")
hmec_data %>%
ggplot(aes(divisions, l1)) +
geom_jitter(size = 4,width = 0.8) +
#ggbeeswarm::geom_quasirandom(size = 4,width = 1) +
geom_smooth(method = "lm", size = 3) +
theme_martin() +
labs(
x = "Number of cell divisions",
y = "L1 distance"
) +
annotate("text", 20, 0.08, label = mu_hmec_annotation, size = 8, parse = TRUE, hjust = 0) +
annotate("text", 63, 0.08, label = mu_ci_hmec_annotation, size = 8, parse = TRUE, hjust = 0) +
stat_cor(aes(label = ..p.label..), size = 8, label.y = 0.075)
ggsave("../plots/Figure2/mu_hmec.pdf", width=8)
`geom_smooth()` using formula = 'y ~ x' Saving 8 x 7 in image `geom_smooth()` using formula = 'y ~ x'
# directroy of all results
dir_list_rptec <- list.files("../data/invitro_mu/",
pattern= "RPTEC", full.names = TRUE)
l <- lapply(
dir_list_rptec,
get_markerlengths_unsubtracted
)
markerlengths_rptec <- bind_rows(l)
RPTEC
markerlengths_rptec <- markerlengths_rptec %>%
group_by(subject, marker) %>%
filter(any(str_detect(sample, "Root1"))) %>%
mutate(length = length - length[sample==str_subset(sample, "Root1")[1]]) %>%
ungroup()
filtered_matrix <- markerlengths_rptec %>%
select(-subject) %>%
pivot_wider(names_from = marker,
values_from = length) %>%
column_to_rownames("sample") %>%
as.matrix()
minmax <- max(abs(filtered_matrix), na.rm = TRUE)
plot_title <- "Normal kidney epithelium (RPTEC)"
options(repr.plot.width=14, repr.plot.height=7)
hm <- pheatmap::pheatmap(filtered_matrix,
clustering_distance_rows = "manhattan",
clustering_distance_cols = "manhattan",
display_numbers = TRUE, number_color = "black", fontsize = 20,
color = colorRampPalette(c("seagreen", "white", "purple3"))(42),
na_col = "yellow", breaks = seq(-minmax, minmax, length.out = 43),
main = plot_title
)
ggsave("../plots/Supplementary_figures/rptec_hm.pdf", hm, height=13, width=30)
# find samples
samples <- markerlengths_rptec$sample %>% unique
combos_wide <- combn(samples, m= 2) %>% as.data.frame()
## make combo table longer
combos_long <- data.frame(a=as.character(combos_wide[1,]), b=as.character(combos_wide[2,]))
## only use combintions from the same subject
combos <- combos_long %>%
dplyr::select(a, b)
l1_l <- lapply(1:nrow(combos), get_l1_r_for_combination, combos, markerlengths_rptec)
rptec_l1_tbl <- rbindlist(l1_l)
rptec_data <- rptec_l1_tbl %>%
mutate(
divisions = ifelse(a == "RPTEC_Root1" | b == "RPTEC_Root1", 47.44, 47.44 * 2),
days = ifelse(a == "RPTEC_Root1" | b == "RPTEC_Root1", 30, 30 * 2)
)
lm(l1 ~ divisions, data = rptec_data) %>%
summary()
Call:
lm(formula = l1 ~ divisions, data = rptec_data)
Residuals:
Min 1Q Median 3Q Max
-0.0099361 -0.0052185 -0.0003573 0.0037579 0.0110136
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.004e-02 6.552e-03 3.059 0.00914 **
divisions 6.906e-05 7.974e-05 0.866 0.40213
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.006907 on 13 degrees of freedom
Multiple R-squared: 0.05456, Adjusted R-squared: -0.01817
F-statistic: 0.7502 on 1 and 13 DF, p-value: 0.4021
rptec_just_divs <- rptec_data %>%
select(a, b, divisions)
rptec_boot <- bootstrap_mu(markerlengths_rptec, rptec_just_divs)
quantile(unlist(rptec_boot), 0.025) %>%
format(scientific = TRUE)
quantile(unlist(rptec_boot), 0.975) %>%
format(scientific = TRUE)
lower_ci_rptec <- sub("e", " %*% 10^", quantile(unlist(rptec_boot), 0.025) %>%
format(scientific = TRUE))
upper_ci_rptec <- sub("e", " %*% 10^", quantile(unlist(rptec_boot), 0.975) %>%
format(scientific = TRUE))
Manual entering bootstrap values from previous run to reduce computation time.
lower_ci_rptec <- sub("e", " %*% 10^", "-7.53e-05")
upper_ci_rptec <- sub("e", " %*% 10^", "2.02e-04")
# calculating mutation rate based on slope
mu_rptec <- coef((lm(rptec_data$l1 ~ rptec_data$divisions)))[2]
mu_rptec_form <- sub("e", " %*% 10^", format(mu_rptec, digits=3, scientific = TRUE))
mu_rptec_annotation <- paste0("mu == ", mu_rptec_form)
mu_ci_rptec_annotation <- paste0("(", lower_ci_rptec, "-", upper_ci_rptec,")")
rptec_data %>%
ggplot(aes(divisions, l1)) +
geom_jitter(size = 4,width = 0.8) +
#ggbeeswarm::geom_quasirandom(size = 4,width = 1) +
geom_smooth(method = "lm", size = 3) +
theme_martin() +
labs(
x = "Number of cell divisions",
y = "L1 distance"
) +
annotate("text", 45, 0.038, label = mu_rptec_annotation, size = 8, parse = TRUE, hjust = 0) +
annotate("text", 64, 0.038, label = mu_ci_rptec_annotation, size = 8, parse = TRUE, hjust = 0) +
stat_cor(aes(label = ..p.label..), size = 8, label.y = 0.036)
ggsave("../plots/Figure2/mu_rptec.pdf", width=8)
`geom_smooth()` using formula = 'y ~ x' Saving 8 x 7 in image `geom_smooth()` using formula = 'y ~ x'
markerlengths_rptec %>%
filter(!str_detect(sample, "Root")) %>%
pull(sample) %>%
n_distinct()
# directroy of all results
dir_list_ht29 <- list.files("../data/invitro_mu/",
pattern= "HT29", full.names = TRUE)
l <- lapply(
dir_list_ht29,
get_markerlengths_unsubtracted
)
markerlengths_raw_ht29 <- bind_rows(l)
HT29
markerlengths_ht29 <- markerlengths_raw_ht29 %>%
group_by(subject, marker) %>%
filter(any(str_detect(sample, "Root1"))) %>%
mutate(length = length - length[sample==str_subset(sample, "Root1")[1]]) %>%
ungroup()
filtered_matrix <- markerlengths_ht29 %>%
select(-subject) %>%
#filter(sample!="A549Root1") %>%
pivot_wider(names_from = marker,
values_from = length) %>%
column_to_rownames("sample") %>%
as.matrix()
minmax <- max(abs(filtered_matrix), na.rm = TRUE)
plot_title <- "Colon adenocarcinoma (HT29)"
options(repr.plot.width=14, repr.plot.height=7)
hm <- pheatmap::pheatmap(filtered_matrix,
clustering_distance_rows = "manhattan",
clustering_distance_cols = "manhattan",
display_numbers = TRUE, number_color = "black", fontsize = 20,
color = colorRampPalette(c("seagreen", "white", "purple3"))(42),
na_col = "yellow", breaks = seq(-minmax, minmax, length.out = 43),
main = plot_title
)
ggsave("../plots/Supplementary_figures/ht29_hm.pdf", hm, height=13, width=30)
# find samples
samples <- markerlengths_ht29$sample %>% unique
combos_wide <- combn(samples, m= 2) %>% as.data.frame()
## make combo table longer
combos_long <- data.frame(a=as.character(combos_wide[1,]), b=as.character(combos_wide[2,]))
## only use combintions from the same subject
combos <- combos_long %>%
mutate(sample_a=str_extract(a, "^[:upper:]+[:digit:]+"),
sample_b=str_extract(b, "^[:upper:]+[:digit:]+")) %>%
filter(sample_a==sample_b) %>%
dplyr::select(a, b)
l1_l <- lapply(1:nrow(combos), get_l1_r_for_all_combination, combos, markerlengths_ht29)
ht29_l1_tbl <- rbindlist(l1_l)
ht29_data <- ht29_l1_tbl %>%
filter(a != "HT29Root2", b != "HT29Root2") %>%
mutate(divisions = case_when(
a == "HT29Root1" & str_detect(b, "_G1_") ~ 27.49271138,
a == "HT29Root1" & str_detect(b, "_G2_") ~ 45.60432582,
str_detect(a, "_G1_") & str_detect(b, "_G1_") ~ 27.49271138*2,
str_detect(a, "_G2_") & str_detect(b, "_G2_") ~ 45.60432582*2,
str_detect(a, "_G1_") & str_detect(b, "_G2_") ~ 27.49271138+45.60432582,
str_detect(b, "_G1_") & str_detect(a, "_G2_") ~ 27.49271138+45.60432582),
days = case_when(
a == "HT29Root1" & str_detect(b, "_G1_") ~ 44,
a == "HT29Root1" & str_detect(b, "_G2_") ~ 76,
str_detect(a, "_G1_") & str_detect(b, "_G1_") ~ 44*2,
str_detect(a, "_G2_") & str_detect(b, "_G2_") ~ 76*2,
str_detect(a, "_G1_") & str_detect(b, "_G2_") ~ 44+76,
str_detect(b, "_G1_") & str_detect(a, "_G2_") ~ 44+76,
)
)
lm(l1 ~ divisions, data = ht29_data) %>%
summary()
Call:
lm(formula = l1 ~ divisions, data = ht29_data)
Residuals:
Min 1Q Median 3Q Max
-0.0117929 -0.0046101 -0.0009033 0.0037874 0.0206070
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.111e-02 1.336e-03 15.79 <2e-16 ***
divisions 2.475e-05 1.833e-05 1.35 0.178
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.00589 on 433 degrees of freedom
Multiple R-squared: 0.004194, Adjusted R-squared: 0.001895
F-statistic: 1.824 on 1 and 433 DF, p-value: 0.1776
ht29_just_divs <- ht29_data %>%
select(a, b, divisions)
markerlengths_ht29_boot <- markerlengths_ht29 %>%
filter(sample!="HT29Root2")
ht29_boot <- bootstrap_mu(markerlengths_ht29_boot, ht29_just_divs)
quantile(unlist(ht29_boot), 0.025) %>%
format(scientific = TRUE)
quantile(unlist(ht29_boot), 0.975) %>%
format(scientific = TRUE)
lower_ci_ht29 <- sub("e", " %*% 10^", quantile(unlist(ht29_boot), 0.025) %>%
format(scientific = TRUE))
upper_ci_ht29 <- sub("e", " %*% 10^", quantile(unlist(ht29_boot), 0.975) %>%
format(scientific = TRUE))
Manual entering bootstrap values from previous run to reduce computation time.
lower_ci_ht29 <- sub("e", " %*% 10^", "-1.37e-05")
upper_ci_ht29 <- sub("e", " %*% 10^", "6.40e-05")
# calculating mutation rate based on slope
mu_ht29 <- coef((lm(ht29_data$l1 ~ ht29_data$divisions)))[2]
mu_ht29_form <- sub("e", " %*% 10^", format(mu_ht29, digits=3, scientific = TRUE))
mu_ht29_annotation <- paste0("mu == ", mu_ht29_form)
mu_ci_ht29_annotation <- paste0("(", lower_ci_ht29, "-", upper_ci_ht29,")")
ht29_data %>%
ggplot(aes(divisions, l1)) +
geom_jitter(size = 4,width = 10) +
#ggbeeswarm::geom_quasirandom(size = 4,width = 1) +
geom_smooth(method = "lm", size = 3) +
theme_martin() +
labs(
x = "Number of cell divisions",
y = "L1 distance"
) +
annotate("text", 25, 0.048, label = mu_ht29_annotation, size = 8, parse = TRUE, hjust = 0) +
annotate("text", 50, 0.048, label = mu_ci_ht29_annotation, size = 8, parse = TRUE, hjust = 0) +
stat_cor(aes(label = ..p.label..), size = 8, label.y = 0.045)
ggsave("../plots/Figure2/mu_ht29.pdf", width=8)
Warning message: "Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. i Please use `linewidth` instead." Warning message: "The dot-dot notation (`..p.label..`) was deprecated in ggplot2 3.4.0. i Please use `after_stat(p.label)` instead." `geom_smooth()` using formula = 'y ~ x' Saving 8 x 7 in image `geom_smooth()` using formula = 'y ~ x'